/*******************************************************************************
Query_and_Clean_NETS_Data.do

This file contains the WDM SQL queries used to obtain the NETS data on employment, 
sales, and other characteristics for the year 2014. Next, we get a list of unique 
latitude-longitude pairs in NETS, and use the geoinpoly command to map them into 
M-by-M squares for M = 3, 6, 12, 24, 48. We then clean the data in order to take 
out government, education, nonprofits, and the USPS, map each establishment 
into M-by-M squares, then finally calculate employment in each enterprise and 
save only enterprises with at least 5 employees.

Note: This code assumes that an ODBC data source named "WDM" has been created to
connect to the SQL Server where the NETS database is stored (access to NETS must
be purchased from Walls & Associates).

Last updated: 6/4/2020
*******************************************************************************/

version 15
cd "C:\Plants_in_Space"
set more off
set type double
set varabbrev off

****************************QUERYING THE DATA***********************************
clear all

odbc list
// You will see a list of ODBC sources to which Stata can connect. One of them 
// should be "WDM"; otherwise check the configuration in your ODBC Data Source
// Administrator

odbc query "WDM" 
// You will see tables within the WDM data source, namely vw_db_nets_* if access 
// to the data has been purchased.
				 
// Below is the query to load in the raw data for 2014.
local query5 SELECT CASE WHEN HQDuns14=hq.DunsNumber THEN 1 ELSE 0 END AS TopLevel, hq.DunsNumber, hq.HQDuns14, sic.SIC14, emp.Emp14, emp.EmpC14, sales.Sales14, company.ZipCode, hqCo.HQCompany, hqCo.HQDuns, misc.LegalStat, misc.LevelCode, misc.FirstYear, misc.FipsCounty, misc.Latitude, misc.Longitude, DENSE_RANK() OVER (ORDER BY Latitude, Longitude) AS Loc FROM (SELECT DunsNumber, Latitude, Longitude, LegalStat, FirstYear, FipsCounty, LevelCode FROM vw_db_nets_misc) AS misc JOIN (SELECT DunsNumber, Emp14, EmpC14 FROM vw_db_nets_emp) AS emp ON misc.DunsNumber=emp.DunsNumber JOIN (SELECT DunsNumber, Sales14 FROM vw_db_nets_sales) AS sales ON emp.DunsNumber=sales.DunsNumber JOIN (SELECT DunsNumber, SIC14 FROM vw_db_nets_sic) AS sic ON emp.DunsNumber=sic.DunsNumber JOIN (SELECT DunsNumber, HQDuns14 FROM vw_db_nets_hq) AS hq ON sic.DunsNumber=hq.DunsNumber JOIN (SELECT DunsNumber, ZipCode FROM vw_db_nets_company) AS company ON emp.DunsNumber=company.DunsNumber JOIN (SELECT DunsNumber, HQDuns, HQCompany FROM vw_db_nets_hqCompany) AS hqCo ON company.DunsNumber=hqCo.DunsNumber WHERE hq.HQDuns14 IS NOT NULL AND emp.Emp14 IS NOT NULL
odbc load, exec("`query5'")

// Rename variables
ren HQDuns14 hq
ren SIC14 sic8
ren Emp14 emp
ren Sales14 sales
ren EmpC14 emp_record_code

ren HQDuns hq_last
ren ZipCode zip
ren FipsCounty county
ren DunsNumber establishment

// Generate variable corresponding to the year
gen year = 2014

drop FirstYear Loc
ren (Latitude Longitude) (latitude longitude)

duplicates drop establishment sic8 emp sales zip, force

destring emp sales hq sic8 zip county establishment hq_last, replace

// Check that all duplicate establishments have equal values for industry,
// employment, sales, and ZIP code (i.e. there should be one observation 
// per establishment left).
isid establishment 

// Longitude values need to multiplied by negative 1
replace longitude=longitude*(-1)

// Drop establishments with missing sales and employment
drop if missing(sales) | missing(emp) | missing(hq) | missing(sic8)

// Save raw output
save "Data\Raw\NETS\NETS_all_obs_2014_raw.dta", replace	

*****************MAPPING COORDINATES INTO M-BY-M SQUARES************************
// Save only a list of unique latitude-longitude pairs in NETS.
use "Data\Raw\NETS\NETS_all_obs_2014_raw.dta", clear
keep latitude longitude
by latitude longitude, sort: keep if _n == 1
save "Data\Raw\NETS\list_of_NETS_latitude_longitude_pairs.dta", replace

// Use the geoinpoly command to map latitude-longitude coordinates into squares
use "Data\Raw\NETS\list_of_NETS_latitude_longitude_pairs.dta", clear
foreach m of numlist 3 6 12 24 48 {
	geoinpoly latitude longitude using "Shapefiles\Final\US_`m'_mile_square_cells_latitude_longitude.dta", unique
	ren _ID ID_`m'
	label variable ID_`m' "ID code for `m' by `m' mile square in grid" 
	di "`m' completed"
}
save "Data\Intermediate\NETS\NETS_latitude_longitude_pairs_with_grid_IDs.dta", replace

****************************CLEANING THE DATA***********************************
// This section of code imports the raw NETS data and cleans it in order to take
// out government, education, nonprofits, and the USPS, then calculates
// remaining employment in each enterprise and saves only a list of enterprises
// with at least 5 employees, along with their grid identifiers for M = 3, 6, 12, 
// 24, and 48.
use "Data\Raw\NETS\NETS_all_obs_2014_raw.dta", replace

gen sic4 = floor(sic8 / 10000)
gen sic2 = floor(sic4 / 100)
// Take out government and their children. That is, any establishment with a
// government SIC8 code, but also any establishment whose headquarters has
// an SIC8 code corresponding to government.
generate hq_sic8 = sic8 * TopLevel
by hq, sort: egen toplevelsic8 = max(hq_sic8)
drop if toplevelsic8 >= 91000000
keep if sic8 < 91000000
drop toplevelsic8 hq_sic8

// take out education (SIC2 82)
drop if sic4 > 8199 & sic4 < 8300

// take out USPS
drop if hq == 3261245 | hq_last == 3261245

// take out nonprofits
drop if LegalStat == "J"

// Drop Puerto Rico, Alaska, Hawaii, Virgin Islands, and anything else outside
// continental U.S.
gen state = floor(county / 1000)
drop if state == 2 | state == 15 | state > 56

// Merge with latitude-longitude coordinates to get ID numbers for M-by-M 
// squares for each establishment.
merge m:1 latitude longitude using "Data\Intermediate\NETS\NETS_latitude_longitude_pairs_with_grid_IDs.dta", ///
	assert(2 3) keep(3) nogen

// Drop any establishments which are missing ALL the ID numbers for M=3, 6, 12, 
// 24, and 48. We interpret these establishments as having incorrect location 
// information.
drop if missing(ID_3) & missing(ID_6) & missing(ID_12) & missing(ID_24) & missing(ID_48)

// Now calculate total employment for each HQ, then keep only enterprises with 
// at least 5 employees
by hq, sort: egen hqemp = total(emp)
keep if hqemp >= 5

// Finally, calculate total employment in each HQ-SIC8 pair, which we interpret
// as a firm.
by hq sic8, sort: egen firm_employment = total(emp)
gen log_firm_employment = log(firm_employment)
label variable log_firm_employment "Log Total Firm Employment"

destring emp_record_code, replace
drop hq_last LegalStat LevelCode

save "Data\Intermediate\NETS\NETS_HQs_at_least_5_employees_2014_cleaned.dta", replace

***********************************ROBUSTNESS***********************************
// First, keep only firms with at least X=10, 20, 50, 100 plants. Next, keep only
// firms in industries where the largest firm has at least X=10, 20, 50, 100 plants.
use "Data\Intermediate\NETS\NETS_HQs_at_least_5_employees_2014_cleaned.dta", clear
by hq sic8, sort: gen hq_sic8_num_plants = _N
by sic8, sort: egen max_hq_sic8_num_plants = max(hq_sic8_num_plants)
foreach X of numlist 10 20 50 100 {
	preserve
		// Save all data
		keep if hq_sic8_num_plants >= `X'
		drop hq_sic8_num_plants max_hq_sic8_num_plants
		save "Data\Intermediate\NETS\NETS_HQs_at_least_5_employees_2014_cleaned_firms_with_at_least_`X'_plants.dta", replace
		
		// Save only the HQ-SIC8 for these firms
		by hq sic8, sort: keep if _n == 1
		keep hq sic8
		save "Data\Intermediate\NETS\NETS_HQs_at_least_5_employees_2014_hq-sic8_of_firms_with_at_least_`X'_plants.dta", replace
	restore
	
	preserve
		// Save all data
		keep if max_hq_sic8_num_plants >= `X'
		drop hq_sic8_num_plants max_hq_sic8_num_plants
		save "Data\Intermediate\NETS\NETS_HQs_at_least_5_employees_2014_cleaned_industries_where_largest_firm_has_at_least_`X'_plants.dta", replace
		
		// Save only the HQ-SIC8 for these firms
		by hq sic8, sort: keep if _n == 1
		keep hq sic8
		save "Data\Intermediate\NETS\NETS_HQs_at_least_5_employees_2014_hq-sic8_of_firms_in_industries_where_largest_firm_has_at_least_`X'_plants.dta", replace
	restore
}
